{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# seq2seq模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from torch import nn\n",
    "import torch.nn.functional as F\n",
    "\n",
    "use_cuda = torch.cuda.is_available()                    # priority selection gpu\n",
    "device = torch.device('cuda' if use_cuda else 'cpu')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "MAX_LENGTH = 10"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. 定义编码器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "class EncoderRNN(nn.Module):\n",
    "    def __init__(self, vocab_size, hidden_size, n_layers=1):\n",
    "        \"\"\"\n",
    "        vocab_size: the size of vocabulary\n",
    "        hidden_size: the size of the RNN's hidden dimension\n",
    "        n_layers: the number of the RNN layers, default is 1，note the defination of the layers is outside the \n",
    "                  RNN, so the numbers of the hidden units is (1*direction,N,hidden)!!!                 \n",
    "        \"\"\"\n",
    "        super(EncoderRNN, self).__init__()\n",
    "        self.n_layers = n_layers\n",
    "        self.hidden_size = hidden_size\n",
    "        \n",
    "        # embedding the words from one-hot to dense vector\n",
    "        embedding_size = hidden_size\n",
    "        self.embedding = nn.Embedding(vocab_size, embedding_size)\n",
    "        # define the GRU unit\n",
    "        self.gru = nn.GRU(input_size=embedding_size, hidden_size=hidden_size, batch_first=True)\n",
    "        \n",
    "    def forward(self, input_t, hidden_t):\n",
    "        # (N,seq_size) => (N,seq_size,embedding_size)\n",
    "        embedded = self.embedding(input_t)                        # [1,10]=>[1, 10, 128]\n",
    "#         print(embedded.shape)\n",
    "        for i in range(self.n_layers):                            # multi layer \n",
    "            embedded, hidden_t = self.gru(embedded, hidden_t)\n",
    "        # embedded: (N,seq,embedding_size)  hidden_t: (1*direction, N, hidden)      \n",
    "#         print(embedded.shape, hidden_t.shape)                   # torch.Size([1, 0, 128]) torch.Size([1, 1, 128])\n",
    "        return embedded, hidden_t\n",
    "    \n",
    "    def initHidden(self):\n",
    "        # (direction, batch, hidden) note batch is always 1，the way that gru is defined !!! \n",
    "        hidden_0 = torch.zeros(1, 1, self.hidden_size)\n",
    "        return hidden_0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.Input tensor: torch.Size([1, 10])\n",
      "2.outpu shape: torch.Size([1, 10, 128]) hidden shape: torch.Size([1, 1, 128])\n"
     ]
    }
   ],
   "source": [
    "# test the encode model\n",
    "test_in = torch.randint(1,1000, [1,10]).long().to(device)        # input before Embedding must be LongTensor \n",
    "print('1.Input tensor:', test_in.shape)\n",
    "encoder = EncoderRNN(1000, 128, 2).to(device)\n",
    "hidden_0 = encoder.initHidden().to(device)\n",
    "output, hidden = encoder(test_in, hidden_0)                      # output:(batch,seq,hidden*direction)\n",
    "print('2.outpu shape:', output.shape, 'hidden shape:', hidden.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "总结\n",
    "\n",
    "- **batch_first参数只对输入和输出的batch和seq的顺序有影响，输入维度变为(batch,seq,feature)，输出维度(batch,seq,hidden*direction) **\n",
    "\n",
    "**对隐藏状态的维度顺序无影响，仍旧是(layers*direction, batch,hidden)**，单向RNN的direction为1，双向为2\n",
    "- 上述编码器将输入维度(1,10)向量转换为(1,10,64)的向量，隐藏状态维度(1,1,64)\n",
    "- **上述方法中将layer层数定义在RNN外，这样只有一个隐藏状态，维度为(1*direction, batch,hidden)而定义在RNN内部的层数，最终隐藏状态维度为(layers*direction, batch,hidden)**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. 定义解码器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "class DecoderRNN(nn.Module):\n",
    "    def __init__(self, vocab_size, hidden_size, n_layers=1):\n",
    "        \"\"\"\n",
    "        hidden_size: the size of the hidden state\n",
    "        vocab_size: the size of the vocabulary \n",
    "        n_layers: the number of the RNN layers\n",
    "        \"\"\"\n",
    "        super().__init__()\n",
    "        self.n_layers = n_layers\n",
    "        self.hidden_size = hidden_size\n",
    "        \n",
    "        # embedding the word (N,1) => (N,1,embedding_size)\n",
    "        embedding_size = hidden_size\n",
    "        self.embedding = nn.Embedding(vocab_size, embedding_size)\n",
    "        self.gru = nn.GRU(input_size=embedding_size, hidden_size=hidden_size, batch_first=True) \n",
    "        self.linear  = nn.Linear(hidden_size, vocab_size)       # (N,1,vocab_size) output dims equal to vocabulary size\n",
    "        self.softmax = nn.LogSoftmax(dim=2)                     # compute log probabilities\n",
    "        \n",
    "    def forward(self, input_t, hidden_t):\n",
    "        embedded = self.embedding(input_t)                            # (N,1,embedding_size)\n",
    "        for i in range(self.n_layers):\n",
    "            embedded = F.relu(embedded)                               # ??? relu         \n",
    "            embedded, hidden_t = self.gru(embedded, hidden_t)\n",
    "#         print(embedded.shape)                                       # (N,1,embedding_size)\n",
    "        output = self.softmax(self.linear(embedded))                  # (N,1,vocab_size)\n",
    "        return output, hidden_t\n",
    "    \n",
    "    def initHidden(self):\n",
    "        hidden_0 = torch.zeros([1, 1, self.hidden_size])              # give the state of the h0\n",
    "        return hidden_0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.Input tensor: torch.Size([1, 1])\n",
      "2.output and hidden shape: torch.Size([1, 1, 1000]) torch.Size([1, 1, 128])\n"
     ]
    }
   ],
   "source": [
    "# test the decode model\n",
    "test_in = torch.randint(1,1000, [1,1]).long().to(device)        # input before Embedding must be LongTensor \n",
    "print('1.Input tensor:', test_in.shape)\n",
    "decoder = DecoderRNN(1000, 128, 2).to(device)\n",
    "hidden_0 = decoder.initHidden().to(device)\n",
    "output, hidden = decoder(test_in, hidden_0)                      # output:(batch,seq,hidden*direction)\n",
    "print('2.output and hidden shape:', output.shape, hidden.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "总结：\n",
    "\n",
    "注意：解码器是对编码器生成的context隐藏向量进行解码，所以处理的句子长度为1，seq=1，**hidden维度大小为(batch,1,hidden)**\n",
    "语言建模的句子长度可以不为1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. 定义有attention机制的解码器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "class AttnDecoderRNN(nn.Module):\n",
    "    def __init__(self, vocab_size, hidden_size, n_layers=1, dropout_p=0.1, max_length=MAX_LENGTH):\n",
    "        \"\"\"\n",
    "        func: \n",
    "        hidden_size: the size of the hidden state\n",
    "        vocab_size: the size of the vocabulary\n",
    "        n_layers: the number of the RNN layers\n",
    "        dropout_p: probability of an element to be zeroed\n",
    "        max_length: the maximum length of the sequence\n",
    "        \"\"\"\n",
    "        super().__init__()\n",
    "        self.hidden_size = hidden_size\n",
    "        self.vocab_size = vocab_size\n",
    "        self.n_layers = n_layers\n",
    "#         self.dropout_p = dropout_p\n",
    "        self.max_length = max_length\n",
    "        \n",
    "        # embedding the word (N,seq) => (N,seq,hidden_size)\n",
    "        self.embedding = nn.Embedding(vocab_size, hidden_size) \n",
    "        self.attn = nn.Linear(self.hidden_size*2, self.max_length)           # input_dim = hidden*2，equal to cat dim\n",
    "        self.attn_combine = nn.Linear(self.hidden_size*2, self.hidden_size)  # restore to hidden size\n",
    "        self.dropout = nn.Dropout(dropout_p)                                 # regularization\n",
    "        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)\n",
    "        self.out = nn.Linear(hidden_size, vocab_size)                        # # (N,seq,vocab_size)\n",
    "        \n",
    "    def forward(self, input_t, hidden_t, encoder_ouputs):\n",
    "        \"\"\"\n",
    "        input_t: (N, seq), N=1, seq=1\n",
    "        hidden_t: (1*direction, N, hidden_size) , here we use unidirectional GRU=>direction=1\n",
    "        encoder_ouputs:  (N,length, hidden) ???, equal to (N,seq,hidden)???\n",
    "        \n",
    "        \"\"\"\n",
    "        # embedding the word into dense vector, here embedding_size = hidden_size\n",
    "        embedded = self.embedding(input_t)                  # get (N,seq,embedding_size)  N=1\n",
    "        embedded = self.dropout(embedded)                   # !!!RNN中的正则化方法\n",
    "        \n",
    "        # compute the attention weight\n",
    "        # embedded:(N,seq,hidden), here seq=1  |  hidden_t:(1,N,hidzden)    ???seq不唯一怎么处理???\n",
    "        embedded_cat = torch.cat([embedded, hidden_t.permute(1,0,2)], 2)   # get (N,1,2*hidden)\n",
    "        attn_weights = F.softmax(self.attn(embedded_cat), dim=2)           # get (N,1,max_length)\n",
    "        \n",
    "        # (N,1,max_length)*(N,max_length,hidden) => (N,1,hidden)           batch multiply\n",
    "        attn_applied = torch.matmul(attn_weights, encoder_ouputs)          # get (N,1,hidden)\n",
    "        weight_out = torch.cat([embedded, attn_applied], 2)                # get (N,1,2*hidden)\n",
    "        weight_out = self.attn_combine(weight_out)                         # get (N,1,hidden)\n",
    "             \n",
    "        # put data into gru\n",
    "        for i in range(self.n_layers):\n",
    "            weight_out = F.relu(weight_out)                                # ??? \n",
    "            weight_out, hidden_t = self.gru(weight_out, hidden_t)\n",
    "            print (weight_out.shape)\n",
    "        \n",
    "        output = F.log_softmax(self.out(weight_out), dim=2)                       # get (N,1,vocab_size)\n",
    "        return output, hidden_t, attn_weights\n",
    "    \n",
    "    def initHidden(self):\n",
    "        hidden_0 = torch.zeros([1,1,self.hidden_size])\n",
    "        return hidden_0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.Input tensor: torch.Size([1, 1])\n",
      "torch.Size([1, 1, 128])\n",
      "torch.Size([1, 1, 128])\n",
      "2.output and hidden shape: torch.Size([1, 1, 1000]) torch.Size([1, 1, 128]) torch.Size([1, 1, 10])\n"
     ]
    }
   ],
   "source": [
    "# test the decode model\n",
    "test_in = torch.randint(1,1000, [1,1]).long().to(device)        # input before Embedding must be LongTensor \n",
    "print('1.Input tensor:', test_in.shape)\n",
    "decoder = AttnDecoderRNN(1000, 128, 2).to(device)\n",
    "hidden_0 = decoder.initHidden().to(device)\n",
    "encoder_outputs = torch.randn([1,MAX_LENGTH,128]).to(device)\n",
    "output, hidden, attn_weight = decoder(test_in, hidden_0, encoder_outputs)                      # output:(batch,seq,hidden*direction)\n",
    "print('2.output and hidden shape:', output.shape, hidden.shape, attn_weight.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 存在疑问？？\n",
    "1.`torch.cat([embedded, hidden_t.permute(1,0,2)], 2)`，embedded:(N,seq,hidden), 其中seq为1，hidden_t:(1,N,hidden)当前仅当seq=1时可进行特征拼接，训练时如果seq大于1怎么处理？\n",
    "\n",
    "- ?\n",
    "\n",
    "\n",
    "2.解码的输入具体怎么对应到编码器的输出？input_t, hidden_t, encoder_ouputs分别对应哪个？\n",
    "\n",
    "\n",
    "- 在解码器中input_t使用`decoder_input = torch.LongTensor([[SOS_token]]).to(device)`来获取，<SOS>起始符表示开始解码，每次只解码一个单词，所以seq=1，这里使用teacher forcing方法，即训练时使用标签直接输入解码器，而不是根据上个时刻的输出来得到输入，所以input_t会随着句子解码的进行而变化；hidden_t表示编码器得到的隐藏状态，维度为(N,seq,hidden)然后直接输入到解码器中；encoder_outputs是根据编码器的输出得到，由于每个批次句子长短不一，而最大长度为10，这里encoder_outputs维度是(N,10,hidden)，当句子长短小于10时，其他不够的地方直接使用0来表示\n",
    "\n",
    "\n",
    "3.self.attn(embedded_cat)输入的句子中没有进行padding操作，但是如果attention权重的长度为max_length，那没有句子且没有进行padding的部分也会分配attention权重概率，怎么处理?\n",
    "\n",
    "\n",
    "- 每次编码前先定义一个(N,max_length,hidden)大小的零向量encoder_outputs，每次将编码器的输出encoder_output，本项目中的维度为(N,seq,hidden)，其中seq=1，将encoder_output向量按当前批次的句子大小按顺序赋值给encoder_ouputs，这样权重在比当前句子长的部分上的权重概率是0，即没有分配权重，这样解决了该问题\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
